library(ggplot2)
library(data.table)
# Attach plotly; spell out TRUE because `T` is an ordinary, reassignable variable.
library(plotly, quietly = TRUE)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

查看商家的基本统计情况:

# Load the shop table; the file has no header row, so columns are named here.
shop_info <- read.csv(
  "~/tianchi2017/dataset/shop_info.txt",
  header = FALSE
)
names(shop_info) <- c(
  "shop_id", "city_name", "location_id", "per_pay", "score",
  "comment_cnt", "shop_level", "cate_1_name", "cate_2_name", "cate_3_name"
)
# Replace every missing value in the table with 0 before summarising.
shop_info[is.na(shop_info)] <- 0
summary(shop_info)
##     shop_id         city_name    location_id        per_pay     
##  Min.   :   1.0   上海   :285   Min.   :   1.0   Min.   : 1.00  
##  1st Qu.: 500.8   杭州   :225   1st Qu.: 287.8   1st Qu.: 5.00  
##  Median :1000.5   北京   :163   Median : 577.5   Median :10.00  
##  Mean   :1000.5   广州   :136   Mean   : 583.1   Mean   :10.48  
##  3rd Qu.:1500.2   南京   :130   3rd Qu.: 877.2   3rd Qu.:15.00  
##  Max.   :2000.0   武汉   :124   Max.   :1159.0   Max.   :20.00  
##                   (Other):937                                   
##      score        comment_cnt       shop_level             cate_1_name  
##  Min.   :0.000   Min.   : 0.000   Min.   :0.0000   超市便利店    : 579  
##  1st Qu.:1.000   1st Qu.: 0.000   1st Qu.:0.0000   购物          :   1  
##  Median :3.000   Median : 2.000   Median :1.0000   美发/美容/美甲:   1  
##  Mean   :2.288   Mean   : 2.675   Mean   :0.8145   美食          :1415  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:2.0000   休闲娱乐      :   2  
##  Max.   :4.000   Max.   :20.000   Max.   :2.0000   医疗健康      :   2  
##                                                                         
##    cate_2_name    cate_3_name 
##  快餐    :639           :585  
##  超市    :372   西式快餐:405  
##  便利店  :206   中式快餐:220  
##  休闲茶饮:177   生鲜水果:111  
##  小吃    :156   奶茶    : 92  
##  休闲食品:150   其它小吃: 87  
##  (Other) :300   (Other) :500
# Plots: histograms of per_pay, score and comment_cnt.
# binwidth is set explicitly: without it ggplot2 falls back to 30 bins and
# emits a "Pick better value with `binwidth`" message. A width of 1 gives one
# bar per unit of each attribute.
ggplot(data = shop_info, aes(x = per_pay)) +
  geom_histogram(binwidth = 1)

ggplot(data = shop_info, aes(x = score)) +
  geom_histogram(binwidth = 1)

ggplot(data = shop_info, aes(x = comment_cnt)) +
  geom_histogram(binwidth = 1)

# Violin plot of per_pay for each score value.
ggplot(
  data = shop_info,
  aes(x = factor(score), y = per_pay, fill = factor(score))
) +
  geom_violin()

# Violin plot of comment_cnt for each score value.
ggplot(
  data = shop_info,
  aes(x = factor(score), y = comment_cnt, fill = factor(score))
) +
  geom_violin()

查看用户浏览的基本情况:

# Load the view log; the file has no header row, so columns are named here.
user_view <- read.csv(
  "~/tianchi2017/dataset/user_view.txt",
  header = FALSE
)
names(user_view) <- c("user_id", "shop_id", "time_stamp")
# Parse the timestamps as date-times in the machine's own time zone.
user_view$time_stamp <- as.POSIXct(
  user_view$time_stamp,
  format = "%Y-%m-%d %H:%M:%S",
  tz = Sys.timezone()
)
# Count view records per shop, then flatten the table into a data frame
# with a numeric shop_id column and a freq column.
view_stat <- table(user_view$shop_id)
view_stat <- data.frame(
  shop_id = as.numeric(names(view_stat)),
  freq = as.vector(view_stat)
)
nrow(view_stat) # Three shops have no view records.
## [1] 1997
# Full outer join of the shop table and the per-shop view counts on shop_id.
view_shop_info <- merge(shop_info, view_stat, by = "shop_id", all = TRUE)
# Unmatched rows come back with NA; replace every NA in the result with 0.
view_shop_info[is.na(view_shop_info)] <- 0

# Plots
# View count (freq) for each score value, raw scale.
ggplot(
  data = view_shop_info,
  aes(x = factor(score), y = freq, fill = factor(score))
) +
  geom_violin()

# Same comparison on a log2(freq + 1) scale.
ggplot(
  data = view_shop_info,
  aes(x = factor(score), y = log2(freq + 1), fill = factor(score))
) +
  geom_violin()

# log2(freq + 1) by per_pay, restricted to rows with score == 2;
# the y axis is zoomed to the 6-16 range.
ggplot(
  data = view_shop_info[view_shop_info$score == 2, ],
  aes(x = factor(per_pay), y = log2(freq + 1))
) +
  geom_violin() +
  ggtitle("per_pay freq plot for shops with 2 star") +
  coord_cartesian(ylim = c(6, 16))

# Same plot for rows with score == 4.
ggplot(
  data = view_shop_info[view_shop_info$score == 4, ],
  aes(x = factor(per_pay), y = log2(freq + 1))
) +
  geom_violin() +
  ggtitle("per_pay freq plot for shops with 4 star") +
  coord_cartesian(ylim = c(6, 16))

# per_pay by first-level category; x labels are rotated 90 degrees and the
# STKaiti font family is requested for all plot text.
ggplot(
  data = view_shop_info,
  aes(x = factor(cate_1_name), y = per_pay)
) +
  geom_violin() +
  ggtitle("一级品类名称") +
  theme(
    text = element_text(family = "STKaiti", size = 14),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# per_pay by second-level category, same theme settings.
ggplot(
  data = view_shop_info,
  aes(x = factor(cate_2_name), y = per_pay)
) +
  geom_violin() +
  ggtitle("二级分类名称") +
  theme(
    text = element_text(family = "STKaiti", size = 14),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# per_pay by third-level category, same theme settings.
ggplot(
  data = view_shop_info,
  aes(x = factor(cate_3_name), y = per_pay)
) +
  geom_violin() +
  ggtitle("三级分类名称") +
  theme(
    text = element_text(family = "STKaiti", size = 14),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

summary(view_stat$freq) # 得到平均的view数量
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      40     469    1214    2783    3515   62660
# Pick the shops whose total view count lies within 20 of the mean.
# The mean is computed from the data rather than pasted in from rounded
# summary() output, so the selection stays correct if the input changes.
avg_shop <- view_stat[
  abs(view_stat$freq - mean(view_stat$freq)) < 20,
  "shop_id"
]

# Calendar day of each view, taken in the time zone the timestamps carry.
# as.Date() applied directly to a POSIXct defaults to tz = "UTC", which can
# shift views made shortly after local midnight onto a neighbouring day.
# Converting through POSIXlt keeps the object's own time zone.
user_view$day_time <- as.Date(as.POSIXlt(user_view$time_stamp))

# Cross-tabulate views by day and shop, keeping only the shops in avg_shop,
# and flatten the table into a data frame with a Freq column.
avg_view_time_series <- data.frame(
  table(
    user_view[user_view$shop_id %in% avg_shop, c("day_time", "shop_id")]
  )
)

# One line per shop: daily view count over time.
ggplot(
  avg_view_time_series,
  aes(x = as.Date(day_time), y = Freq, colour = shop_id)
) +
  geom_line() +
  xlab("") +
  ylab("Daily Views")

# Cross-tabulate views by day and shop over the whole view log.
view_time_series <- data.frame(
  table(user_view[, c("day_time", "shop_id")])
)

# Histogram of the day-by-shop view counts on a log2(Freq + 1) scale.
ggplot(view_time_series, aes(x = log2(Freq + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

查看用户pay的情况:

# The payment log is large, so it is read with data.table::fread() and then
# modified by reference: `colnames<-` and `$<-` would each copy the table.
user_pay <- fread("~/tianchi2017/dataset/user_pay.txt", header = FALSE)
setnames(user_pay, c("user_id", "shop_id", "time_stamp"))
user_pay[, time_stamp := as.Date(time_stamp)]
# with = FALSE makes the character vector select columns on every data.table
# version (old releases evaluated it as a plain expression instead).
pay_time_series <- data.frame(
  table(user_pay[, c("time_stamp", "shop_id"), with = FALSE])
)

探索天气和支付的关系:

# Load the example weather data.
# NOTE(review): the name `all` shadows base::all(); it is kept unchanged here
# because code outside this section may refer to it.
all <- read.csv(file = "~/tianchi2017/dataset/example_weather.csv")
# Box plot of the pay column for each value of the weather column.
plot_ly(
  all,
  x = ~weather,
  y = ~pay,
  type = "box"
)
## Warning: Ignoring 2 observations